outline

  1. structure
  2. Data: dataset, preprocessing, visualisation,
  3. preliminary examination: paired correlation and spatial correlation, paired R squared, scatterplot
  4. machine learning methods: model parameter tuning, interpretation

The tutorial shows the processes of data exploration and data analysis for the global air pollution modeling project. The statistical learning methods used include random forest, stochastic gradient boosting, extreme gradient boosting, and a mechanical model using nonlinear regression. The partial dependence plot and variable importance are visualised to partly interpret the models.

Required packages

Auxiliary package with preprocessed data in a dataframe.

# Install the companion package (ships the preprocessed data and helper
# functions used throughout this tutorial), attach it, and list its exports.
install_github("mengluchu/globalLUR/globalLUR/globalLUR")
library(globalLUR)
ls("package:globalLUR")
##  [1] "Brt_imp"          "Brt_LUR"          "cforest_LUR"     
##  [4] "countrywithppm"   "create_ring"      "ctree_LUR"       
##  [7] "error_matrix"     "join_by_id"       "Lasso"           
## [10] "Lassoselected"    "mechanical"       "merge_roads"     
## [13] "merged"           "mergeraster2file" "plot_error"      
## [16] "plot_rsq"         "ppm2ug"           "RDring_coef"     
## [19] "removedips"       "rf_imp"           "rf_LUR"          
## [22] "sampledf"         "scatterplot"      "subset_grep"     
## [25] "univar_rsq"       "xgboost_imp"      "xgboost_LUR"

If the above is not successful for MacOS users, the following and a restart of R may be needed

# Force R's locale to UTF-8 on macOS (works around install/encoding failures);
# restart R after running this.
system('defaults write org.R-project.R force.LANG en_US.UTF-8')

Preparation

Get 3 color palettes.

# RColorBrewer palettes reused in later plots.
colorB = brewer.pal(7, "Greens")    # sequential greens
colorG = brewer.pal(11, "PiYG")     # diverging pink-green
colorS = brewer.pal(11, "Spectral") # diverging spectral
#set.seed(2)

Dataset:

value_mean: annual mean NO2, day_value: annual mean NO2 at daytime, night_value: annual mean NO2 at night time.
1. road_XX_size: road length within a buffer with radius “size” of type XX.
2. I_size: Industrial area within a buffer with radius “size”.
3. Tropomi_2018: TROPOMI averaged over Feb 2018 - Jan 2019.
4. temperature_2m_m: monthly mean temperature at 2m height of month “m”.
5. wind_speed_10m_m: monthly mean wind speed at 10m height of month “m”.
6. pop1k/ 3k /5k: population 1, 3, 5 km resolution.
7. Rsp: Surface remote sensing and chemical transport model product.
8. OMI_mean_filt: OMI column density, 2017 annual average.

# add data usethis::use_data()
# Load the preprocessed global monitoring dataframe shipped with the package.
data("merged")
# Fixed: was names(data), which inspected the base `data` function and
# printed NULL instead of the column names of the dataset.
names(merged)
data("countrywithppm") # countries with ppm (parts per million)
summary(merged)
##     LATITUDE        LONGITUDE          value_mean     OMI_mean_filt      
##  Min.   :-36.92   Min.   :-122.463   Min.   : 1.559   Min.   :4.732e+14  
##  1st Qu.: 32.79   1st Qu.:   2.451   1st Qu.:17.262   1st Qu.:2.122e+15  
##  Median : 41.53   Median :  13.323   Median :26.164   Median :3.292e+15  
##  Mean   : 40.02   Mean   :  44.988   Mean   :27.905   Mean   :4.418e+15  
##  3rd Qu.: 48.65   3rd Qu.: 112.552   3rd Qu.:36.883   3rd Qu.:5.405e+15  
##  Max.   : 69.66   Max.   : 153.158   Max.   :93.885   Max.   :1.936e+16  
##                                                                          
##    elevation          pop1k            RSp         temperature_2m_1 
##  Min.   :-289.0   Min.   :    0   Min.   :-1.000   Min.   :-24.329  
##  1st Qu.:  28.0   1st Qu.: 1512   1st Qu.: 1.393   1st Qu.: -1.639  
##  Median :  90.0   Median : 3454   Median : 2.852   Median :  2.462  
##  Mean   : 240.7   Mean   : 5926   Mean   : 3.796   Mean   :  2.826  
##  3rd Qu.: 280.0   3rd Qu.: 6955   3rd Qu.: 5.391   3rd Qu.:  7.033  
##  Max.   :2905.0   Max.   :87147   Max.   :27.538   Max.   : 27.697  
##                                                                     
##  temperature_2m_10  temperature_2m_11 temperature_2m_12 temperature_2m_2 
##  Min.   : 0.07401   Min.   :-13.067   Min.   :-24.285   Min.   :-18.282  
##  1st Qu.:11.21953   1st Qu.:  5.208   1st Qu.:  1.772   1st Qu.:  2.869  
##  Median :13.65798   Median :  7.410   Median :  4.799   Median :  6.143  
##  Mean   :14.42080   Mean   :  8.504   Mean   :  4.700   Mean   :  6.018  
##  3rd Qu.:17.09752   3rd Qu.: 12.283   3rd Qu.:  7.733   3rd Qu.:  8.918  
##  Max.   :29.36926   Max.   : 27.786   Max.   : 26.249   Max.   : 28.373  
##                                                                          
##  temperature_2m_3 temperature_2m_4 temperature_2m_5 temperature_2m_6
##  Min.   :-7.370   Min.   :-1.022   Min.   : 3.573   Min.   : 5.45   
##  1st Qu.: 7.195   1st Qu.: 8.642   1st Qu.:14.818   1st Qu.:18.68   
##  Median : 9.235   Median :11.813   Median :16.577   Median :20.93   
##  Mean   : 9.342   Mean   :12.800   Mean   :17.795   Mean   :21.29   
##  3rd Qu.:11.455   3rd Qu.:17.392   3rd Qu.:21.876   3rd Qu.:24.49   
##  Max.   :30.274   Max.   :31.617   Max.   :33.735   Max.   :33.63   
##                                                                     
##  temperature_2m_7 temperature_2m_8 temperature_2m_9 wind_speed_10m_1
##  Min.   : 5.981   Min.   : 6.031   Min.   : 6.686   Min.   : 1.599  
##  1st Qu.:18.824   1st Qu.:18.337   1st Qu.:13.690   1st Qu.: 2.784  
##  Median :22.893   Median :21.328   Median :17.125   Median : 3.448  
##  Mean   :23.032   Mean   :22.257   Mean   :18.295   Mean   : 3.568  
##  3rd Qu.:27.729   3rd Qu.:26.946   3rd Qu.:23.065   3rd Qu.: 3.933  
##  Max.   :34.505   Max.   :33.713   Max.   :30.042   Max.   :10.171  
##                                                                     
##  wind_speed_10m_10 wind_speed_10m_11 wind_speed_10m_12 wind_speed_10m_2
##  Min.   : 1.592    Min.   : 1.538    Min.   : 1.388    Min.   :1.695   
##  1st Qu.: 2.737    1st Qu.: 2.820    1st Qu.: 2.864    1st Qu.:2.950   
##  Median : 3.451    Median : 3.504    Median : 3.704    Median :3.639   
##  Mean   : 3.751    Mean   : 3.676    Mean   : 4.032    Mean   :3.871   
##  3rd Qu.: 4.329    3rd Qu.: 4.168    3rd Qu.: 5.023    3rd Qu.:4.490   
##  Max.   :10.660    Max.   :10.380    Max.   :11.064    Max.   :9.831   
##                                                                        
##  wind_speed_10m_3 wind_speed_10m_4 wind_speed_10m_5 wind_speed_10m_6
##  Min.   :1.672    Min.   :1.825    Min.   :1.581    Min.   :1.518   
##  1st Qu.:2.921    1st Qu.:2.998    1st Qu.:2.724    1st Qu.:2.612   
##  Median :3.480    Median :3.347    Median :3.201    Median :3.146   
##  Mean   :3.674    Mean   :3.521    Mean   :3.329    Mean   :3.306   
##  3rd Qu.:4.273    3rd Qu.:3.894    3rd Qu.:3.772    3rd Qu.:3.746   
##  Max.   :8.556    Max.   :8.196    Max.   :6.705    Max.   :7.569   
##                                                                     
##  wind_speed_10m_7 wind_speed_10m_8 wind_speed_10m_9     pop5k        
##  Min.   :1.163    Min.   :1.330    Min.   :1.425    Min.   :      0  
##  1st Qu.:2.662    1st Qu.:2.481    1st Qu.:2.468    1st Qu.:  69519  
##  Median :3.153    Median :2.816    Median :2.954    Median : 173910  
##  Mean   :3.250    Mean   :2.995    Mean   :3.157    Mean   : 293613  
##  3rd Qu.:3.662    3rd Qu.:3.231    3rd Qu.:3.655    3rd Qu.: 345032  
##  Max.   :7.276    Max.   :6.844    Max.   :8.407    Max.   :3766416  
##                                                                      
##      pop3k            country       ROAD_1_25         ROAD_1_50      
##  Min.   :      0   CN     :1296   Min.   : -1.000   Min.   : -1.000  
##  1st Qu.:  32664   FR     : 636   1st Qu.:  0.000   1st Qu.:  0.000  
##  Median :  79828   ES     : 380   Median :  0.000   Median :  0.000  
##  Mean   : 131287   DE     : 338   Mean   :  2.744   Mean   :  6.902  
##  3rd Qu.: 163222   GB     : 131   3rd Qu.:  0.000   3rd Qu.:  0.000  
##  Max.   :1860921   AT     : 124   Max.   :300.650   Max.   :603.000  
##                    (Other): 731                                      
##    ROAD_1_100        ROAD_1_300       ROAD_1_500        ROAD_1_800   
##  Min.   :  -1.00   Min.   :  -1.0   Min.   :   -1.0   Min.   :   -1  
##  1st Qu.:   0.00   1st Qu.:   0.0   1st Qu.:    0.0   1st Qu.:    0  
##  Median :   0.00   Median :   0.0   Median :    0.0   Median :    0  
##  Mean   :  23.17   Mean   : 175.1   Mean   :  481.5   Mean   : 1207  
##  3rd Qu.:   0.00   3rd Qu.:   0.0   3rd Qu.:    0.0   3rd Qu.: 2010  
##  Max.   :1287.63   Max.   :7517.8   Max.   :13376.8   Max.   :20041  
##                                                                      
##   ROAD_1_1000     ROAD_1_3000      ROAD_1_5000       ROAD_2_25      
##  Min.   :   -1   Min.   :    -1   Min.   :    -1   Min.   : -1.000  
##  1st Qu.:    0   1st Qu.:  4380   1st Qu.: 22177   1st Qu.:  0.000  
##  Median :    0   Median : 14296   Median : 39887   Median :  0.000  
##  Mean   : 1928   Mean   : 17633   Mean   : 46610   Mean   :  5.394  
##  3rd Qu.: 3524   3rd Qu.: 25798   3rd Qu.: 63770   3rd Qu.:  0.000  
##  Max.   :25727   Max.   :112511   Max.   :273482   Max.   :330.800  
##                                                                     
##    ROAD_2_50        ROAD_2_100        ROAD_2_300       ROAD_2_500    
##  Min.   : -1.00   Min.   :  -1.00   Min.   :  -1.0   Min.   :  -1.0  
##  1st Qu.:  0.00   1st Qu.:   0.00   1st Qu.:   0.0   1st Qu.:   0.0  
##  Median :  0.00   Median :   0.00   Median :   0.0   Median :   0.0  
##  Mean   : 13.75   Mean   :  44.43   Mean   : 294.9   Mean   : 798.3  
##  3rd Qu.:  0.00   3rd Qu.:   0.00   3rd Qu.: 483.3   3rd Qu.:1500.8  
##  Max.   :559.69   Max.   :1318.85   Max.   :4161.3   Max.   :6372.3  
##                                                                      
##    ROAD_2_800     ROAD_2_1000     ROAD_2_3000      ROAD_2_5000    
##  Min.   :   -1   Min.   :   -1   Min.   :    -1   Min.   :    -1  
##  1st Qu.:    0   1st Qu.:    0   1st Qu.:  7012   1st Qu.: 17286  
##  Median : 1146   Median : 2061   Median : 16005   Median : 35261  
##  Mean   : 1898   Mean   : 2852   Mean   : 19887   Mean   : 44942  
##  3rd Qu.: 3156   3rd Qu.: 4397   3rd Qu.: 28344   3rd Qu.: 62093  
##  Max.   :13732   Max.   :20693   Max.   :107719   Max.   :251141  
##                                                                   
##    ROAD_3_25       ROAD_3_50        ROAD_3_100       ROAD_3_300    
##  Min.   : -1.0   Min.   : -1.00   Min.   : -1.00   Min.   :  -1.0  
##  1st Qu.:  0.0   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:   0.0  
##  Median :  0.0   Median :  0.00   Median :  0.00   Median :   0.0  
##  Mean   :  5.8   Mean   : 14.57   Mean   : 49.69   Mean   : 390.8  
##  3rd Qu.:  0.0   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.: 653.7  
##  Max.   :259.6   Max.   :447.69   Max.   :973.28   Max.   :3767.1  
##                                                                    
##    ROAD_3_500       ROAD_3_800        ROAD_3_1000       ROAD_3_3000    
##  Min.   :  -1.0   Min.   :   -1.00   Min.   :   -1.0   Min.   :    -1  
##  1st Qu.:   0.0   1st Qu.:   61.29   1st Qu.:  883.8   1st Qu.: 10603  
##  Median : 770.1   Median : 1987.41   Median : 3187.6   Median : 21942  
##  Mean   :1066.5   Mean   : 2540.52   Mean   : 3840.6   Mean   : 25476  
##  3rd Qu.:1786.0   3rd Qu.: 3890.31   3rd Qu.: 5769.2   3rd Qu.: 36173  
##  Max.   :9253.2   Max.   :15623.34   Max.   :20488.8   Max.   :117425  
##                                                                        
##   ROAD_3_5000       ROAD_4_25         ROAD_4_50        ROAD_4_100    
##  Min.   :    -1   Min.   : -1.000   Min.   : -1.00   Min.   : -1.00  
##  1st Qu.: 25440   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median : 46484   Median :  0.000   Median :  0.00   Median :  0.00  
##  Mean   : 55963   Mean   :  4.595   Mean   : 11.89   Mean   : 43.48  
##  3rd Qu.: 76207   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  1.84  
##  Max.   :290257   Max.   :295.920   Max.   :608.00   Max.   :931.12  
##                                                                      
##    ROAD_4_300       ROAD_4_500       ROAD_4_800       ROAD_4_1000   
##  Min.   :  -1.0   Min.   :  -1.0   Min.   :   -1.0   Min.   :   -1  
##  1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:  785.2   1st Qu.: 1421  
##  Median : 166.6   Median : 850.1   Median : 2104.2   Median : 3352  
##  Mean   : 392.3   Mean   :1085.7   Mean   : 2662.0   Mean   : 4052  
##  3rd Qu.: 630.6   3rd Qu.:1718.6   3rd Qu.: 3914.7   3rd Qu.: 5898  
##  Max.   :3559.8   Max.   :8337.4   Max.   :19735.0   Max.   :28916  
##                                                                     
##   ROAD_4_3000      ROAD_4_5000       ROAD_5_25       ROAD_5_50     
##  Min.   :    -1   Min.   :    -1   Min.   : -1.0   Min.   : -1.00  
##  1st Qu.: 13866   1st Qu.: 33964   1st Qu.:  0.0   1st Qu.:  0.00  
##  Median : 24213   Median : 54814   Median :  0.0   Median :  0.00  
##  Mean   : 27911   Mean   : 64167   Mean   : 16.1   Mean   : 42.13  
##  3rd Qu.: 37575   3rd Qu.: 84278   3rd Qu.: 25.0   3rd Qu.: 75.83  
##  Max.   :167554   Max.   :318870   Max.   :186.8   Max.   :405.45  
##                                                                    
##    ROAD_5_100        ROAD_5_300        ROAD_5_500        ROAD_5_800   
##  Min.   :  -1.00   Min.   :   -1.0   Min.   :   -1.0   Min.   :   -1  
##  1st Qu.:   0.00   1st Qu.:   98.5   1st Qu.:  712.1   1st Qu.: 2073  
##  Median :  83.55   Median : 1178.4   Median : 3444.5   Median : 8441  
##  Mean   : 162.68   Mean   : 1521.5   Mean   : 4241.5   Mean   :10184  
##  3rd Qu.: 282.86   3rd Qu.: 2517.5   3rd Qu.: 6888.9   3rd Qu.:16234  
##  Max.   :1361.67   Max.   :10474.6   Max.   :27457.3   Max.   :62956  
##                                                                       
##   ROAD_5_1000     ROAD_5_3000      ROAD_5_5000          I_1_25      
##  Min.   :   -1   Min.   :    -1   Min.   :     -1   Min.   :  -1.0  
##  1st Qu.: 3451   1st Qu.: 24529   1st Qu.:  54316   1st Qu.:   0.0  
##  Median :12659   Median : 72484   Median : 155987   Median :   0.0  
##  Mean   :15243   Mean   : 96543   Mean   : 212826   Mean   : 107.4  
##  3rd Qu.:24161   3rd Qu.:151711   3rd Qu.: 312072   3rd Qu.:   0.0  
##  Max.   :93332   Max.   :554192   Max.   :1496360   Max.   :3125.0  
##                                                                     
##      I_1_50          I_1_100         I_1_300          I_1_500      
##  Min.   :  -1.0   Min.   :   -1   Min.   :    -1   Min.   :    -1  
##  1st Qu.:   0.0   1st Qu.:    0   1st Qu.:     0   1st Qu.:     0  
##  Median :   0.0   Median :    0   Median :     0   Median :     0  
##  Mean   : 280.4   Mean   : 1071   Mean   : 10390   Mean   : 31411  
##  3rd Qu.:   0.0   3rd Qu.:    0   3rd Qu.:     0   3rd Qu.:  1875  
##  Max.   :8125.0   Max.   :30625   Max.   :275625   Max.   :785625  
##                                                                    
##     I_1_800           I_1_1000          I_1_3000           I_1_5000       
##  Min.   :     -1   Min.   :     -1   Min.   :      -1   Min.   :      -1  
##  1st Qu.:      0   1st Qu.:      0   1st Qu.:    8125   1st Qu.:  349375  
##  Median :      0   Median :      0   Median :  624375   Median : 2037500  
##  Mean   :  87671   Mean   : 143734   Mean   : 1325656   Mean   : 3140991  
##  3rd Qu.:  65000   3rd Qu.: 143281   3rd Qu.: 1948906   3rd Qu.: 4803750  
##  Max.   :2005625   Max.   :2968750   Max.   :14392500   Max.   :30615000  
##                                                                           
##   Tropomi_2018       day_value       night_value             ID        
##  Min.   :  82.75   Min.   : 1.883   Min.   :  0.4307   Min.   :   1.0  
##  1st Qu.: 263.42   1st Qu.:18.353   1st Qu.: 15.4550   1st Qu.: 909.8  
##  Median : 391.67   Median :28.433   Median : 22.3804   Median :1818.5  
##  Mean   : 481.66   Mean   :30.912   Mean   : 23.5377   Mean   :1818.5  
##  3rd Qu.: 648.00   3rd Qu.:41.900   3rd Qu.: 29.6922   3rd Qu.:2727.2  
##  Max.   :1441.78   Max.   :94.266   Max.   :100.5855   Max.   :3636.0  
##                    NA's   :5        NA's   :1
# Interactive, column-filterable table of the merged dataset (DT package).
datatable(merged, rownames = FALSE, filter = "top", options = list(pageLength = 5, scrollX = T))

Replace the missing-data sentinel value -1 with NA

# -1 marks missing values in the raw data (see the summary above); recode to NA.
merged_1 = na_if(merged, -1)

Merge roads of road types 3, 4, 5; the road lengths of these road types are aggregated. The original road-type columns are substituted (with keep = T, they are retained).

# Aggregate road types 3, 4, 5 into combined ROAD_M345_* buffer variables;
# keep = F drops the original per-type road columns.
merged_mr = merge_roads(merged_1, c(3, 4, 5), keep = F) # keep = T keeps the original roads. 
names(merged_mr)
##  [1] "ROAD_M345_25"      "ROAD_M345_50"      "ROAD_M345_100"    
##  [4] "ROAD_M345_300"     "ROAD_M345_500"     "ROAD_M345_800"    
##  [7] "ROAD_M345_1000"    "ROAD_M345_3000"    "ROAD_M345_5000"   
## [10] "LATITUDE"          "LONGITUDE"         "value_mean"       
## [13] "OMI_mean_filt"     "elevation"         "pop1k"            
## [16] "RSp"               "temperature_2m_1"  "temperature_2m_10"
## [19] "temperature_2m_11" "temperature_2m_12" "temperature_2m_2" 
## [22] "temperature_2m_3"  "temperature_2m_4"  "temperature_2m_5" 
## [25] "temperature_2m_6"  "temperature_2m_7"  "temperature_2m_8" 
## [28] "temperature_2m_9"  "wind_speed_10m_1"  "wind_speed_10m_10"
## [31] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_2" 
## [34] "wind_speed_10m_3"  "wind_speed_10m_4"  "wind_speed_10m_5" 
## [37] "wind_speed_10m_6"  "wind_speed_10m_7"  "wind_speed_10m_8" 
## [40] "wind_speed_10m_9"  "pop5k"             "pop3k"            
## [43] "country"           "ROAD_1_25"         "ROAD_1_50"        
## [46] "ROAD_1_100"        "ROAD_1_300"        "ROAD_1_500"       
## [49] "ROAD_1_800"        "ROAD_1_1000"       "ROAD_1_3000"      
## [52] "ROAD_1_5000"       "ROAD_2_25"         "ROAD_2_50"        
## [55] "ROAD_2_100"        "ROAD_2_300"        "ROAD_2_500"       
## [58] "ROAD_2_800"        "ROAD_2_1000"       "ROAD_2_3000"      
## [61] "ROAD_2_5000"       "I_1_25"            "I_1_50"           
## [64] "I_1_100"           "I_1_300"           "I_1_500"          
## [67] "I_1_800"           "I_1_1000"          "I_1_3000"         
## [70] "I_1_5000"          "Tropomi_2018"      "day_value"        
## [73] "night_value"       "ID"
#numeric country
#inde_var$country=as.numeric(inde_var$country)

Visualization

Visualize with tmap: convenient

# Convert the dataframe to sf points and render an interactive dot map of the
# annual mean NO2 over OpenStreetMap; saved as a standalone HTML widget.
locations_sf = st_as_sf(merged_mr, coords = c("LONGITUDE","LATITUDE"))
osm_valuemean = tm_shape(locations_sf) +
  tm_dots( "value_mean", col = "value_mean", size = 0.05,
     popup.vars = c("value_mean", "day_value", "night_value", "ROAD_2_100", "ROAD_2_5000")) + tm_view(basemaps = c('OpenStreetMap'))
#+tm_shape(lnd)+tm_lines()
tmap_save(osm_valuemean, "NO2mean.html")

Visualize with leaflet: more control. Show the day/night ratio; red: day/night > 1, blue: day/night < 1

# Flag each station by whether daytime NO2 exceeds nighttime NO2, then map:
# circle radius scales with the annual mean, colour encodes the day/night
# ratio; home buttons jump to Beijing and Utrecht.
merged_fp = merged_mr %>% mutate(ratiodn = day_value/night_value) %>% mutate(color = ifelse(ratiodn >1, "red", "blue"))
m  = leaflet(merged_fp) %>%
     addTiles() %>% addCircleMarkers(radius = ~value_mean/5, color = ~color, popup =   ~as.character(value_mean),fill = FALSE) %>% addProviderTiles(providers$Esri.NatGeoWorldMap) %>% addMouseCoordinates() %>%  
addHomeButton(ext = extent(116.2, 117, 39.7, 40), layer.name = "Beijing") %>% addHomeButton(ext = extent(5, 5.2, 52, 52.2), layer.name = "Utrecht")
saveWidget(m, file = "NO2daynight.html")

Boxplot

# Build "CC:Country name" labels from the two-letter ISO codes.
countryname = paste(merged_mr$country, countrycode(merged_mr$country, 'iso2c', 'country.name'), sep = ":") 

# Tag countries that report in ppm with a "*" suffix.
# NOTE(review): the inner subset countrywithppm[countrywithppm %in%
# merged_mr$country] looks redundant — %in% countrywithppm alone should give
# the same result; confirm before simplifying.
countryname_s_e=ifelse( merged_mr$country %in% countrywithppm[countrywithppm %in% merged_mr$country], paste(countryname,"*", sep = ""), countryname)
merged_mr$countryfullname = countryname_s_e

# use the per-country median of value_mean for the box fill colour
mergedmedian = merged_mr %>% group_by(country) %>% mutate(median =  median(value_mean, na.rm = TRUE))

# Boxplot of NO2 (ug/m^3) per country, one box per country, filled by median.
bp2 <- ggplot(mergedmedian, aes(x=countryfullname, y=value_mean, group=country)) +
  labs(x = "Country", y = expression(paste("NO"[2], "  ", mu, "g/", "m"^3)), cex = 1.5) +
  geom_boxplot(aes(fill = median)) + 
  theme(text = element_text(size = 13), axis.text.x = element_text(angle = 90, hjust = 1)) +   scale_fill_distiller(palette = "Spectral")
#   scale_color_brewer(direction = 1)
print(bp2 + ylim(0, 100))  # clip the y-axis at 100 for readability

Plot the paired correlation, for road predictors, population, Tropomi. For DE, CN, and world

# Pairwise correlation matrices (upper triangle, pie glyphs) among the
# responses (*_value), road, population, and TROPOMI predictors —
# for Germany, China, and the whole world respectively.
merged_mr %>% na.omit %>% filter(country == "DE") %>% dplyr::select(matches("_value|ROAD|pop|Trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)

merged_mr %>% na.omit %>% filter(country == "CN") %>% dplyr::select(matches("_value|ROAD|pop|Trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)

merged_mr %>% na.omit %>% dplyr::select(matches("_value|ROAD|pop|Trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)

Spatial dependency

# Empirical variogram of the annual mean over all stations: first with the
# default cutoff, then restricted to lags up to 10 (coordinate units —
# presumably degrees, since the points carry lon/lat; TODO confirm CRS).
grd_sp <- as_Spatial(locations_sf)
dt.vgm = variogram(value_mean~1, grd_sp)
plot(dt.vgm)

dt.vgm = variogram(value_mean~1, grd_sp, cutoff = 10)
plot(dt.vgm)

# Plot the empirical variogram of value_mean for the stations of one country.
# COUN: two-letter country code (e.g. "DE"); cutoff: maximum lag distance
# passed on to gstat::variogram. Reads `locations_sf` from the global
# environment. Called for its plot side effect.
countryvariogram <- function(COUN, cutoff) {
  country_points <- filter(locations_sf, country == COUN)
  country_sp <- as_Spatial(country_points)
  country_vgm <- variogram(value_mean ~ 1, country_sp, cutoff = cutoff)
  plot(country_vgm)
}
 
# Per-country variograms with a cutoff of 1 (same units as the coordinates).
countryvariogram("DE", 1)

countryvariogram("US", 1)

countryvariogram("CN", 1) # reason?

#Moran I test
#install.packages("ape", dependencies = TRUE)
#library(ape)

# Commented-out Moran's I test of spatial autocorrelation for US stations,
# using an inverse-distance weight matrix (kept for reference).
#merged_mrf =  merged_mr%>%filter(country == "US")
#no2.dists <- as.matrix(dist(cbind(merged_mrf$LONGITUDE, merged_mrf$LATITUDE)))
#no2.dists[1:5, 1:5]
#no2.inv <- 1/no2.dists
#diag(no2.inv) <- 0
#no2.inv[1:5, 1:5]
#Moran.I(merged_mrf$value_mean, na.rm = T, no2.inv) 

Data preprocessing:

  1. add variables by ID or by rasters (not in this document).
  2. remove unwanted columns or records,
  3. select records (e.g. by country), separate testing and training sets.

Separate the dataset into training and test datasets with a fraction (here 80% of the records are used for training, the rest for testing); “DE” is the two-digit code for Germany. For the world, the sampling uses the fraction per country.

#merged = merge(merged, stat[,-which(names(stat)%in%c("LATITUTE", "LONGITUDE"))], by = "ID", all.x = T)

Germany as an example

# 80/20 train/test split restricted to Germany; identifier and coordinate
# columns are removed from the predictor set via the grep pattern.
response_predictor = globalLUR::sampledf(merged_mr, fraction = 0.8, country2digit = "DE", grepstring_rm = "ID|LATITUDE|LONGITUDE|countryfullname")

# Retrieve test indices, training indices, and the full variable dataframe.
 
test = response_predictor$test
training = response_predictor$training
inde_var = response_predictor$inde_var
inde_var = inde_var %>% dplyr::select(-country)  # drop the country factor

The size of test and training dataset

# Sizes of the test and training index vectors (number of records in each).
length(test)
## [1] 67
length(training)
## [1] 267

The paired correlation between dependent (mean, day, night) and independent variables. How much information does R-squared tell you?

# Check univariate R-squared: calculate the r-sq of each predictor against the
# day, night, and mean NO2 responses, then bind the columns into one dataframe
# for plotting.
rsqmean =  inde_var %>% dplyr::select(-matches("value_mean|day_|night_")) %>%  univar_rsq (inde_var$value_mean)

rsqday = inde_var %>% dplyr::select(-matches("value_mean|day_|night_")) %>%  univar_rsq (inde_var$day_value) 
rsqnight = inde_var %>% dplyr::select(-matches("value_mean|day_|night_")) %>%  univar_rsq (inde_var$night_value) 

rsqdf = cbind(rsqmean, rsqday, rsqnight, rownames(rsqmean))  
names(rsqdf)= c("mean","day","night","vars")

plot_rsq(rsqdf = rsqdf, varname = "vars",xlab = "predictors", ylab = "R-squared")

# How does it compare to the variable importance estimated from LASSO, RF, SGB, XGB, etc.?

The scatter plots between selected predictors and the response (here day_value)

# Scatter plots of selected road/population/temperature predictors against
# day_value, with a loess smoother.
inde_var %>% dplyr::select(matches("ROAD_M345_3000|pop3k|ROAD_2_50$|temperature_2m_7|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "loess")

# Satellite and model products (TROPOMI, OMI, RSp) against day_value.
inde_var %>% dplyr::select(matches("Tro|OMI|Rsp|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "loess")

# can choose any variable to look at the scatterplot

#inde_var %>% dplyr::select(matches("ROAD_1|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "gam")

#inde_var %>% dplyr::select(matches("ROAD_2|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "loess")

#scatterplot(inde_var, "night_value", "gam")
#scatterplot(inde_var,"value_mean", "gam" )

Modelling

  1. Tree based
  2. Lasso
  3. Mechanical model (nls)

Extra: 5) separate urban/rural hierarchical / two-step linear regression 6) mixed effects regression

LM: linear regression model

If simply using linear regression, the mean, day, night. Predictors are population, temperature, wind speed, GEOM product, OMI tropo column, elevation, and road buffers.

i.e. ROAD|population|value_mean|temperature|wind|GEOM product|OMI|elevation.

Note population is not always significant, though the individual R square for each buffer is high. The prediction for night is much better than for the day

# Restrict the training rows to the modelling predictors plus the response.
inde_var_train = subset_grep(inde_var[training, ], "ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi|value_mean")
Regression tree

The tree and prediction error will be different if shuffling training and testing data.

# Refit the conditional inference tree under four different train/test
# shuffles (seeds 2-5) to show how the tree and its error vary with the split.
for (i in 2:5)
{
  set.seed(i)
  testtree = globalLUR::sampledf(merged_mr,fraction = 0.8, "DE" )
  with (testtree, ctree_LUR(inde_var, y_varname= c("day_value"), training = training, test =  test, grepstring ="ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi" ))
}
Random forest

Creates diverse set of trees because 1) trees are instable w.r.t. changes in learning/training data (bagging) 2) randomly preselect mtry splitting variables in each split

model training and parameter tuning

# caret: list every model tag that caret::train can fit.
names(getModelInfo())
##   [1] "ada"                 "AdaBag"              "AdaBoost.M1"        
##   [4] "adaboost"            "amdai"               "ANFIS"              
##   [7] "avNNet"              "awnb"                "awtan"              
##  [10] "bag"                 "bagEarth"            "bagEarthGCV"        
##  [13] "bagFDA"              "bagFDAGCV"           "bam"                
##  [16] "bartMachine"         "bayesglm"            "binda"              
##  [19] "blackboost"          "blasso"              "blassoAveraged"     
##  [22] "bridge"              "brnn"                "BstLm"              
##  [25] "bstSm"               "bstTree"             "C5.0"               
##  [28] "C5.0Cost"            "C5.0Rules"           "C5.0Tree"           
##  [31] "cforest"             "chaid"               "CSimca"             
##  [34] "ctree"               "ctree2"              "cubist"             
##  [37] "dda"                 "deepboost"           "DENFIS"             
##  [40] "dnn"                 "dwdLinear"           "dwdPoly"            
##  [43] "dwdRadial"           "earth"               "elm"                
##  [46] "enet"                "evtree"              "extraTrees"         
##  [49] "fda"                 "FH.GBML"             "FIR.DM"             
##  [52] "foba"                "FRBCS.CHI"           "FRBCS.W"            
##  [55] "FS.HGD"              "gam"                 "gamboost"           
##  [58] "gamLoess"            "gamSpline"           "gaussprLinear"      
##  [61] "gaussprPoly"         "gaussprRadial"       "gbm_h2o"            
##  [64] "gbm"                 "gcvEarth"            "GFS.FR.MOGUL"       
##  [67] "GFS.LT.RS"           "GFS.THRIFT"          "glm.nb"             
##  [70] "glm"                 "glmboost"            "glmnet_h2o"         
##  [73] "glmnet"              "glmStepAIC"          "gpls"               
##  [76] "hda"                 "hdda"                "hdrda"              
##  [79] "HYFIS"               "icr"                 "J48"                
##  [82] "JRip"                "kernelpls"           "kknn"               
##  [85] "knn"                 "krlsPoly"            "krlsRadial"         
##  [88] "lars"                "lars2"               "lasso"              
##  [91] "lda"                 "lda2"                "leapBackward"       
##  [94] "leapForward"         "leapSeq"             "Linda"              
##  [97] "lm"                  "lmStepAIC"           "LMT"                
## [100] "loclda"              "logicBag"            "LogitBoost"         
## [103] "logreg"              "lssvmLinear"         "lssvmPoly"          
## [106] "lssvmRadial"         "lvq"                 "M5"                 
## [109] "M5Rules"             "manb"                "mda"                
## [112] "Mlda"                "mlp"                 "mlpKerasDecay"      
## [115] "mlpKerasDecayCost"   "mlpKerasDropout"     "mlpKerasDropoutCost"
## [118] "mlpML"               "mlpSGD"              "mlpWeightDecay"     
## [121] "mlpWeightDecayML"    "monmlp"              "msaenet"            
## [124] "multinom"            "mxnet"               "mxnetAdam"          
## [127] "naive_bayes"         "nb"                  "nbDiscrete"         
## [130] "nbSearch"            "neuralnet"           "nnet"               
## [133] "nnls"                "nodeHarvest"         "null"               
## [136] "OneR"                "ordinalNet"          "ordinalRF"          
## [139] "ORFlog"              "ORFpls"              "ORFridge"           
## [142] "ORFsvm"              "ownn"                "pam"                
## [145] "parRF"               "PART"                "partDSA"            
## [148] "pcaNNet"             "pcr"                 "pda"                
## [151] "pda2"                "penalized"           "PenalizedLDA"       
## [154] "plr"                 "pls"                 "plsRglm"            
## [157] "polr"                "ppr"                 "PRIM"               
## [160] "protoclass"          "qda"                 "QdaCov"             
## [163] "qrf"                 "qrnn"                "randomGLM"          
## [166] "ranger"              "rbf"                 "rbfDDA"             
## [169] "Rborist"             "rda"                 "regLogistic"        
## [172] "relaxo"              "rf"                  "rFerns"             
## [175] "RFlda"               "rfRules"             "ridge"              
## [178] "rlda"                "rlm"                 "rmda"               
## [181] "rocc"                "rotationForest"      "rotationForestCp"   
## [184] "rpart"               "rpart1SE"            "rpart2"             
## [187] "rpartCost"           "rpartScore"          "rqlasso"            
## [190] "rqnc"                "RRF"                 "RRFglobal"          
## [193] "rrlda"               "RSimca"              "rvmLinear"          
## [196] "rvmPoly"             "rvmRadial"           "SBC"                
## [199] "sda"                 "sdwd"                "simpls"             
## [202] "SLAVE"               "slda"                "smda"               
## [205] "snn"                 "sparseLDA"           "spikeslab"          
## [208] "spls"                "stepLDA"             "stepQDA"            
## [211] "superpc"             "svmBoundrangeString" "svmExpoString"      
## [214] "svmLinear"           "svmLinear2"          "svmLinear3"         
## [217] "svmLinearWeights"    "svmLinearWeights2"   "svmPoly"            
## [220] "svmRadial"           "svmRadialCost"       "svmRadialSigma"     
## [223] "svmRadialWeights"    "svmSpectrumString"   "tan"                
## [226] "tanSearch"           "treebag"             "vbmpRadial"         
## [229] "vglmAdjCat"          "vglmContRatio"       "vglmCumulative"     
## [232] "widekernelpls"       "WM"                  "wsrf"               
## [235] "xgbDART"             "xgbLinear"           "xgbTree"            
## [238] "xyf"
# NOTE(review): identical to the subset built before the regression-tree
# section; recomputed here so this chunk can run standalone.
inde_var_train = subset_grep(inde_var[training, ], "ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi|value_mean")

Training RF using Caret

Mtry

model_rf = train(value_mean ~ ., data = inde_var_train, method='rf') # mtry
plot(model_rf)

fitted <- predict(model_rf, inde_var[test, ])
error_matrix(prediction = fitted, validation = inde_var[test, ]$value_mean)
model_gbm = train(value_mean ~ ., data = inde_var[training, ], method='gbm')
plot(model_gbm)
#gbm.step optimal number of trees. 
#install.packages("caretEnsemble")
library(caretEnsemble)

# Stacking Algorithms - Run multiple algos in one call.
trainControl <- trainControl(method = "repeatedcv", 
                             number = 10, 
                             repeats = 2,
                             savePredictions = TRUE, 
                             classProbs = TRUE)

algorithmList <- c('rf', 'adaboost', 'earth', 'xgbDART', 'svmRadial')

set.seed(100)
models <- caretList(value_mean ~ ., data = inde_var_train, trControl = trainControl, methodList = algorithmList) 
results <- resamples(models)
summary(results)

Important variables and Partial plots: using the “vip” package

set.seed(2)
# Show which performance metrics vip::vi() can use for permutation importance.
vip::list_metrics()
##     Metric                       Description                      Task
## 1      auc            Area under (ROC) curve     Binary classification
## 2    error           Misclassification error     Binary classification
## 3  logloss                          Log loss     Binary classification
## 4     mauc Multiclass area under (ROC) curve Multiclass classification
## 5 mlogloss               Multiclass log loss Multiclass classification
## 6      mse                Mean squared error                Regression
## 7       r2                         R squared                Regression
## 8 rsquared                         R squared                Regression
## 9     rmse           Root mean squared error                Regression
# Restrict to the selected predictor families plus the response.
pre_mat <- subset_grep(
  inde_var_train,
  grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop"
)
# Fit the forest; permutation importance must be requested at fit time.
rf <- ranger(
  value_mean ~ .,
  data = pre_mat,
  mtry = 33,
  num.trees = 2000,
  importance = "permutation"
)
rf
## Ranger result
## 
## Call:
##  ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000,      importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  2000 
## Sample size:                      267 
## Number of independent variables:  65 
## Mtry:                             33 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       55.38431 
## R squared (OOB):                  0.6551465
# ranger method
# Permutation importance per predictor (larger value = more important).
importance(rf)
##      ROAD_M345_25      ROAD_M345_50     ROAD_M345_100     ROAD_M345_300 
##      1.5937397995      0.3869103843      1.0238120394      4.0543400979 
##     ROAD_M345_500     ROAD_M345_800    ROAD_M345_1000    ROAD_M345_3000 
##      0.6517413989      0.3489697790      0.8390844666     34.9617743923 
##    ROAD_M345_5000         elevation             pop1k  temperature_2m_1 
##      6.4888193292      0.5530545048      5.4438423606      0.1180595682 
## temperature_2m_10 temperature_2m_11 temperature_2m_12  temperature_2m_2 
##      0.8166281186      0.2851219354      0.2536844795      0.2151382204 
##  temperature_2m_3  temperature_2m_4  temperature_2m_5  temperature_2m_6 
##      0.0393546928      0.0561927641      0.0358757873      0.2495222502 
##  temperature_2m_7  temperature_2m_8  temperature_2m_9  wind_speed_10m_1 
##     -0.1067887579     -0.0298316086      0.1039563733     -0.0634048175 
## wind_speed_10m_10 wind_speed_10m_11 wind_speed_10m_12  wind_speed_10m_2 
##      1.4353714203      0.2690264802      0.1028837866      0.9881879592 
##  wind_speed_10m_3  wind_speed_10m_4  wind_speed_10m_5  wind_speed_10m_6 
##      0.7102671931      0.3061532104      0.1843722335      0.8492074342 
##  wind_speed_10m_7  wind_speed_10m_8  wind_speed_10m_9             pop5k 
##      0.5071510735      1.7267515495      1.8329691498      5.2327781093 
##             pop3k         ROAD_1_25         ROAD_1_50        ROAD_1_100 
##     17.0247538280      0.0044800733      0.0246943849      0.0595378317 
##        ROAD_1_300        ROAD_1_500        ROAD_1_800       ROAD_1_1000 
##      0.0584749800      0.3037501036      0.1112666838      0.0757891949 
##       ROAD_1_3000       ROAD_1_5000         ROAD_2_25         ROAD_2_50 
##      1.2379497813      5.0620814121     12.1620790590     30.9688035370 
##        ROAD_2_100        ROAD_2_300        ROAD_2_500        ROAD_2_800 
##     11.8755215392      2.0319057664      0.9026689173      0.4299128973 
##       ROAD_2_1000       ROAD_2_3000       ROAD_2_5000            I_1_25 
##     -0.0136316676      1.1113183194      1.0525460847     -0.0022763541 
##            I_1_50           I_1_100           I_1_300           I_1_500 
##      0.0189187343     -0.0009304509      0.0500863673      0.1097465448 
##           I_1_800          I_1_1000          I_1_3000          I_1_5000 
##      0.1519352991      0.0647437891     -0.0350714075      0.7847547256 
##      Tropomi_2018 
##      1.2565969636
#vip
# Permutation importance via the vip package; metric and target are stated
# explicitly, which makes the computation unambiguous.
DF_P_r2 <- vi(rf, method = "permute", target = "value_mean", metric = "r2")
DF_P_rmse <- vi(rf, method = "permute", target = "value_mean", metric = "rmse")

vip(DF_P_rmse)

vip(DF_P_r2)

Partial dependence plots: all the variables (using sparklines takes a while).

library(DT)
library(sparkline)
# NOTE(review): "DF" is not defined in this chunk; presumably it is the
# variable-importance table computed above (e.g. DF_P_rmse) -- confirm.
a=add_sparklines(DF, fit = rf)
library(htmlwidgets)
# Save the interactive sparkline table as a standalone HTML file.
saveWidget(a, file="sparkline.html")

Partial dependence plot of selected variables

# Keep the response and three influential predictors only.
pre_mat_s <- inde_var_train %>%
  select(value_mean, ROAD_2_50, pop3k, ROAD_M345_300)

# Linear benchmark on the reduced predictor set.
lm_s <- lm(value_mean ~ ., data = pre_mat_s)

# Random forest on the same three predictors, with permutation importance.
rf_s <- ranger(value_mean ~ ., data = pre_mat_s, num.trees = 2000,
               importance = "permutation")
rf_s
## Ranger result
## 
## Call:
##  ranger(value_mean ~ ., data = pre_mat_s, num.trees = 2000, importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  2000 
## Sample size:                      267 
## Number of independent variables:  3 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       65.25373 
## R squared (OOB):                  0.593694

correlation

# Pairwise correlation among the three selected predictors.
pre_mat_predictor <- pre_mat_s %>% select(-value_mean)
ggpairs(pre_mat_predictor)

# Partial dependence of the linear model on ROAD_M345_300.
p_lm <- partial(lm_s, "ROAD_M345_300", plot = TRUE, rug = TRUE)
plot(p_lm)

# Partial dependence of the random forest on the same predictor.
p2 <- partial(rf_s, "ROAD_M345_300", plot = TRUE, rug = TRUE)
plot(p2)

# slow: two-way partial dependence over a pop3k x ROAD_M345_300 grid.
pd <- partial(rf_s, pred.var = c("pop3k", "ROAD_M345_300"))

# Default PDP
pd1 <- plotPartial(pd)

# Add contour lines and use a different color palette
rwb <- colorRampPalette(c("red", "white", "blue"))
pdp2 <- plotPartial(pd, contour = TRUE, col.regions = rwb)

# 3-D surface
#pdp3 <- plotPartial(pd, levelplot = F, zlab = "ROAD_1_50", colorkey = T, 
#                   screen = list(z = -20, x = -60) )

# One-way partial dependence for the remaining two predictors, then arrange
# all panels on one page.
p3 <- partial(rf_s, "ROAD_2_50", plot = TRUE, rug = TRUE)
p1 <- partial(rf_s, "pop3k", plot = TRUE, rug = TRUE)
grid.arrange(p1, p2, p3, pd1, pdp2, ncol = 2)

Gradient boosting

# Same predictor subset as used for the random forest above.
pre_mat <- subset_grep(
  inde_var_train,
  grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop"
)

# Stochastic gradient boosting: 2000 trees, depth 6, slow learning rate,
# half of the rows sampled for each tree.
gbm1 <- gbm(
  formula = value_mean ~ .,
  data = pre_mat,
  distribution = "gaussian",
  n.trees = 2000,
  interaction.depth = 6,
  shrinkage = 0.01,
  bag.fraction = 0.5
)
names(pre_mat)
##  [1] "ROAD_M345_25"      "ROAD_M345_50"      "ROAD_M345_100"    
##  [4] "ROAD_M345_300"     "ROAD_M345_500"     "ROAD_M345_800"    
##  [7] "ROAD_M345_1000"    "ROAD_M345_3000"    "ROAD_M345_5000"   
## [10] "value_mean"        "elevation"         "pop1k"            
## [13] "temperature_2m_1"  "temperature_2m_10" "temperature_2m_11"
## [16] "temperature_2m_12" "temperature_2m_2"  "temperature_2m_3" 
## [19] "temperature_2m_4"  "temperature_2m_5"  "temperature_2m_6" 
## [22] "temperature_2m_7"  "temperature_2m_8"  "temperature_2m_9" 
## [25] "wind_speed_10m_1"  "wind_speed_10m_10" "wind_speed_10m_11"
## [28] "wind_speed_10m_12" "wind_speed_10m_2"  "wind_speed_10m_3" 
## [31] "wind_speed_10m_4"  "wind_speed_10m_5"  "wind_speed_10m_6" 
## [34] "wind_speed_10m_7"  "wind_speed_10m_8"  "wind_speed_10m_9" 
## [37] "pop5k"             "pop3k"             "ROAD_1_25"        
## [40] "ROAD_1_50"         "ROAD_1_100"        "ROAD_1_300"       
## [43] "ROAD_1_500"        "ROAD_1_800"        "ROAD_1_1000"      
## [46] "ROAD_1_3000"       "ROAD_1_5000"       "ROAD_2_25"        
## [49] "ROAD_2_50"         "ROAD_2_100"        "ROAD_2_300"       
## [52] "ROAD_2_500"        "ROAD_2_800"        "ROAD_2_1000"      
## [55] "ROAD_2_3000"       "ROAD_2_5000"       "I_1_25"           
## [58] "I_1_50"            "I_1_100"           "I_1_300"          
## [61] "I_1_500"           "I_1_800"           "I_1_1000"         
## [64] "I_1_3000"          "I_1_5000"          "Tropomi_2018"
# Relative influence of each predictor in the boosted model.
summary(gbm1)

##                                 var      rel.inf
## ROAD_M345_3000       ROAD_M345_3000 18.635109805
## ROAD_2_50                 ROAD_2_50  7.245938393
## pop3k                         pop3k  6.647881497
## ROAD_2_100               ROAD_2_100  5.528536677
## ROAD_1_5000             ROAD_1_5000  4.286134230
## pop1k                         pop1k  3.862034030
## ROAD_2_25                 ROAD_2_25  3.198065949
## ROAD_M345_5000       ROAD_M345_5000  3.147119762
## ROAD_M345_1000       ROAD_M345_1000  2.653005426
## ROAD_M345_300         ROAD_M345_300  2.637273199
## pop5k                         pop5k  2.573345217
## ROAD_M345_100         ROAD_M345_100  2.279573049
## Tropomi_2018           Tropomi_2018  2.113698640
## ROAD_M345_500         ROAD_M345_500  1.968656395
## ROAD_1_3000             ROAD_1_3000  1.959604922
## ROAD_M345_25           ROAD_M345_25  1.795781607
## elevation                 elevation  1.541735961
## ROAD_2_5000             ROAD_2_5000  1.500913379
## ROAD_2_3000             ROAD_2_3000  1.437845215
## ROAD_2_1000             ROAD_2_1000  1.430160424
## wind_speed_10m_7   wind_speed_10m_7  1.419919919
## ROAD_M345_800         ROAD_M345_800  1.398825439
## ROAD_2_300               ROAD_2_300  1.233401403
## temperature_2m_6   temperature_2m_6  1.231989243
## I_1_3000                   I_1_3000  1.159601408
## I_1_1000                   I_1_1000  1.086322996
## ROAD_2_800               ROAD_2_800  1.063737889
## ROAD_2_500               ROAD_2_500  0.919873722
## I_1_5000                   I_1_5000  0.897750484
## wind_speed_10m_1   wind_speed_10m_1  0.875728094
## ROAD_M345_50           ROAD_M345_50  0.760418968
## wind_speed_10m_9   wind_speed_10m_9  0.681294119
## ROAD_1_500               ROAD_1_500  0.667697713
## temperature_2m_4   temperature_2m_4  0.665306251
## I_1_800                     I_1_800  0.626670237
## temperature_2m_8   temperature_2m_8  0.616876998
## wind_speed_10m_11 wind_speed_10m_11  0.573138320
## temperature_2m_2   temperature_2m_2  0.565620301
## wind_speed_10m_3   wind_speed_10m_3  0.543444198
## wind_speed_10m_4   wind_speed_10m_4  0.529655104
## wind_speed_10m_8   wind_speed_10m_8  0.525294128
## wind_speed_10m_10 wind_speed_10m_10  0.515074744
## I_1_500                     I_1_500  0.480113722
## temperature_2m_7   temperature_2m_7  0.432547878
## temperature_2m_1   temperature_2m_1  0.430331582
## temperature_2m_9   temperature_2m_9  0.426148696
## temperature_2m_3   temperature_2m_3  0.383272647
## temperature_2m_5   temperature_2m_5  0.329065001
## wind_speed_10m_5   wind_speed_10m_5  0.325021929
## wind_speed_10m_12 wind_speed_10m_12  0.316548711
## ROAD_1_800               ROAD_1_800  0.283225900
## temperature_2m_12 temperature_2m_12  0.270442849
## ROAD_1_1000             ROAD_1_1000  0.253241626
## wind_speed_10m_2   wind_speed_10m_2  0.240541771
## temperature_2m_10 temperature_2m_10  0.229540226
## ROAD_1_300               ROAD_1_300  0.208372015
## wind_speed_10m_6   wind_speed_10m_6  0.187093811
## I_1_300                     I_1_300  0.107942047
## temperature_2m_11 temperature_2m_11  0.082347925
## I_1_50                       I_1_50  0.007411669
## I_1_100                     I_1_100  0.006734537
## ROAD_1_25                 ROAD_1_25  0.000000000
## ROAD_1_50                 ROAD_1_50  0.000000000
## ROAD_1_100               ROAD_1_100  0.000000000
## I_1_25                       I_1_25  0.000000000
# Joint partial dependence on the 2nd and 3rd predictor columns of pre_mat.
plot(gbm1, i.var = 2:3)

#plot(gbm1, i.var = 1) 
#rf_residual <- pre_rf -  rdf_test$NO2

Xgboost

Tuning XGBoost is more complex (as it has a lot more hyperparameters to tune): https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

  1. gamma[default=0][range: (0,Inf)] It controls regularization (or prevents overfitting). The optimal value of gamma depends on the data set and other parameter values. Higher the value, higher the regularization. Regularization means penalizing large coefficients which don’t improve the model’s performance. default = 0 means no regularization. Tune trick: Start with 0 and check CV error rate. If you see train error >>> test error, bring gamma into action.

  2. lambda and Alpha

  3. nrounds[default=100] It controls the maximum number of iterations. For classification, it is similar to the number of trees to grow. Should be tuned using CV

  4. eta[default=0.3][range: (0,1)] It controls the learning rate, i.e., the rate at which our model learns patterns in data. After every round, it shrinks the feature weights to reach the best optimum. Lower eta leads to slower computation. It must be supported by increase in nrounds. Typically, it lies between 0.01 - 0.3

  5. max_depth[default=6][range: (0,Inf)] It controls the depth of the tree. Larger data sets require deep trees to learn the rules from data. Should be tuned using CV

  # Build train/test design matrices for daytime NO2 from the selected
  # predictor families.
  y_varname <- "day_value"
  varstring <- "ROAD|pop|temp|wind|RSp|OMI|eleva|coast|I_1|Tropo"
  prenres <- paste(y_varname, "|", varstring, sep = "")
  sub_mat <- subset_grep(inde_var, prenres)

  x_train <- sub_mat[training, ]
  y_train <- sub_mat[training, y_varname]

  x_test <- sub_mat[test, ]
  y_test <- sub_mat[test, y_varname]

  df_train <- data.table(x_train, keep.rownames = FALSE)
  df_test <- data.table(x_test, keep.rownames = FALSE)
  formu <- as.formula(paste(y_varname, "~.", sep = ""))
  # Drop the intercept column; xgboost expects a plain feature matrix.
  dfmatrix_train <- sparse.model.matrix(formu, data = df_train)[, -1]
  dfmatrix_test <- sparse.model.matrix(formu, data = df_test)[, -1]

  train_b <- xgb.DMatrix(data = dfmatrix_train, label = y_train)
  test_b <- xgb.DMatrix(data = dfmatrix_test, label = y_test)
  # NOTE: xgboost parameter names are case-sensitive and lower-case.  The
  # original "Gamma = 2" was silently ignored, so no regularization was
  # applied.  "nrounds" is an argument of xgb.train()/xgb.cv(), not a booster
  # parameter, so it is kept out of the params list.
  params <- list(booster = "gbtree",
                 max_depth = 4,
                 eta = 0.05,
                 nthread = 2,
                 gamma = 2)
  #xgb_t = xgb.train (params = params, data = train_b, nrounds = 500, watchlist = list(val=test_b, train = train_b), print_every_n = 10, early_stopping_rounds = 50, maximize = F , eval_metric = "rmse")

  #outputvec = inde_var[training, y_varname]
  max_depth <- 4
  eta <- 0.01
  nthread <- 4
  nrounds <- 1000
  Gamma <- 2  # value only; it must be passed to xgboost as lower-case "gamma"

  # simplest: tuning of rounds.  10-fold CV selects the number of boosting
  # rounds; stops when the test RMSE has not improved for 50 rounds.
  xgbcv <- xgb.cv(data = train_b, nfold = 10, nround = nrounds, eta = eta,
                  nthread = nthread, gamma = Gamma, showsd = TRUE,
                  stratified = TRUE, print_every_n = 200,
                  early_stopping_rounds = 50, maximize = FALSE)
## [1]  train-rmse:29.275966+0.273696   test-rmse:29.259226+2.462071 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [201]    train-rmse:5.620102+0.070028    test-rmse:11.048028+1.523232 
## [401]    train-rmse:1.485326+0.060579    test-rmse:10.108446+1.349093 
## Stopping. Best iteration:
## [459]    train-rmse:1.100368+0.058864    test-rmse:10.085322+1.326256
  # Inspect the cross-validation log and the best iteration.
  xgbcv
## ##### xgb.cv 10-folds
##     iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
##        1      29.2759661     0.27369602       29.25923      2.462071
##        2      29.0138133     0.27126430       29.01890      2.452536
##        3      28.7542561     0.26892306       28.78637      2.442605
##        4      28.4973000     0.26669302       28.54876      2.432511
##        5      28.2430079     0.26435966       28.32009      2.417639
## ---                                                                 
##      505       0.8870495     0.05612478       10.09810      1.322524
##      506       0.8831940     0.05621285       10.09850      1.322775
##      507       0.8793501     0.05608361       10.09852      1.322913
##      508       0.8754825     0.05594716       10.09839      1.323030
##      509       0.8714984     0.05592954       10.09831      1.322678
## Best iteration:
##  iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
##   459        1.100368     0.05886373       10.08532      1.326256
  # Full structure of the xgb.cv result (evaluation log, folds, best iteration).
  str(xgbcv)
## List of 9
##  $ call           : language xgb.cv(data = train_b, nrounds = nrounds, nfold = 10, showsd = T,      stratified = T, print_every_n = 200, early| __truncated__ ...
##  $ params         :List of 4
##   ..$ eta    : num 0.01
##   ..$ nthread: num 4
##   ..$ Gamma  : num 2
##   ..$ silent : num 1
##  $ callbacks      :List of 3
##   ..$ cb.print.evaluation:function (env = parent.frame())  
##   .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n, showsd = showsd)
##   .. ..- attr(*, "name")= chr "cb.print.evaluation"
##   ..$ cb.evaluation.log  :function (env = parent.frame(), finalize = FALSE)  
##   .. ..- attr(*, "call")= language cb.evaluation.log()
##   .. ..- attr(*, "name")= chr "cb.evaluation.log"
##   ..$ cb.early.stop      :function (env = parent.frame(), finalize = FALSE)  
##   .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize,      verbose = verbose)
##   .. ..- attr(*, "name")= chr "cb.early.stop"
##  $ evaluation_log :Classes 'data.table' and 'data.frame':    509 obs. of  5 variables:
##   ..$ iter           : num [1:509] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ train_rmse_mean: num [1:509] 29.3 29 28.8 28.5 28.2 ...
##   ..$ train_rmse_std : num [1:509] 0.274 0.271 0.269 0.267 0.264 ...
##   ..$ test_rmse_mean : num [1:509] 29.3 29 28.8 28.5 28.3 ...
##   ..$ test_rmse_std  : num [1:509] 2.46 2.45 2.44 2.43 2.42 ...
##   ..- attr(*, ".internal.selfref")=<externalptr> 
##  $ niter          : int 509
##  $ nfeatures      : int 67
##  $ folds          :List of 10
##   ..$ : int [1:26] 99 49 148 193 65 257 16 10 27 169 ...
##   ..$ : int [1:26] 95 259 46 137 167 183 110 207 7 155 ...
##   ..$ : int [1:26] 103 54 147 214 15 154 29 125 129 243 ...
##   ..$ : int [1:26] 108 22 237 98 198 251 165 72 12 78 ...
##   ..$ : int [1:26] 190 63 11 256 140 17 56 94 187 168 ...
##   ..$ : int [1:26] 206 132 241 133 246 64 231 77 116 156 ...
##   ..$ : int [1:26] 176 218 123 255 113 220 26 199 90 138 ...
##   ..$ : int [1:26] 192 83 224 182 75 44 180 85 55 57 ...
##   ..$ : int [1:26] 233 178 24 53 185 96 235 82 158 191 ...
##   ..$ : int [1:33] 86 238 81 170 248 157 260 144 88 102 ...
##  $ best_iteration : int 459
##  $ best_ntreelimit: num 459
##  - attr(*, "class")= chr "xgb.cv.synchronous"
  # Refit on the full training set with the CV-selected number of rounds.
  # xgboost parameters are case-sensitive: the original "Gamma = Gamma" was
  # silently ignored, so the penalty is passed as lower-case "gamma".
  bst <- xgboost(data = train_b, max_depth = max_depth, eta = eta,
                 nthread = nthread, nrounds = xgbcv$best_iteration,
                 gamma = Gamma, verbose = 1, print_every_n = 200,
                 early_stopping_rounds = 50, maximize = FALSE)
## [1]  train-rmse:29.277185 
## Will train until train_rmse hasn't improved in 50 rounds.
## 
## [201]    train-rmse:6.472103 
## [401]    train-rmse:3.211355 
## [459]    train-rmse:2.870294
  # Hold-out performance.  Use named arguments so prediction and validation
  # are not swapped (the positional call passed y_test as the prediction;
  # cf. the named error_matrix() call earlier in the tutorial).
  xgbpre <- predict(bst, test_b)
  error_matrix(prediction = xgbpre, validation = y_test)
##      RMSE       MAE       IQR 
## 11.688346  8.401553 11.580907
# Convenience wrapper: subset predictors, fit and evaluate XGBoost in one call.
varstring <- "ROAD|pop|temp|wind|RSp|OMI|eleva|coast|I_1|Tropo"
xgboost_LUR(inde_var, max_depth = 4, eta = 0.02, nthread = 2, nrounds = 2000,
            y_varname = c("day_value"), training = training, test = test,
            grepstring = varstring)
##               Feature        Gain       Cover   Frequency
##  1:    ROAD_M345_3000 0.440978815 0.030722455 0.024549348
##  2:         ROAD_2_50 0.162633167 0.010285872 0.008963251
##  3:             pop1k 0.042820990 0.025945678 0.022856289
##  4:       ROAD_1_5000 0.035491080 0.040208195 0.038392590
##  5:     ROAD_M345_300 0.030985638 0.036846602 0.044866049
##  6:      ROAD_M345_25 0.030677120 0.049502260 0.106314112
##  7:    ROAD_M345_5000 0.017652947 0.034927037 0.030524848
##  8:        ROAD_2_100 0.015427478 0.035027348 0.021063639
##  9:        ROAD_2_300 0.013518111 0.037221001 0.022109352
## 10:             pop3k 0.013220215 0.008762371 0.008266109
## 11: wind_speed_10m_10 0.012560588 0.002381560 0.006124888
## 12:    ROAD_M345_1000 0.011740863 0.045738010 0.037147694
## 13:       ROAD_2_3000 0.011522640 0.047562916 0.039637486
## 14:          I_1_1000 0.009936516 0.023094587 0.021113435
## 15:       ROAD_2_5000 0.009075646 0.034851215 0.030275869
##      RMSE       MAE       IQR 
## 11.644071  8.558915 12.994102
#xgboost_imp (variabledf = inde_var, y_varname = "day_value", max_depth = 5, eta = 0.02, nthread = 4, nrounds = 2000, training = training, test = test, grepstring = varstring )

spatial correlation of errors of random forest

set.seed(2)
# Draw an 80/20 split restricted to China; pr supplies the training/test ids.
pr <- globalLUR::sampledf(merged_mr, fraction = 0.8, country2digit = "CN",
                          grepstring_rm = "ID|countryfullname")
inde_var_train <- with(pr, inde_var[training, ])
inde_var_test <- with(pr, inde_var[test, ])
pre_mat <- subset_grep(
  inde_var_train,
  grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop"
)
rf <- ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000,
             importance = "permutation")
rf
## Ranger result
## 
## Call:
##  ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000,      importance = "permutation") 
## 
## Type:                             Regression 
## Number of trees:                  2000 
## Sample size:                      1009 
## Number of independent variables:  65 
## Mtry:                             33 
## Target node size:                 5 
## Variable importance mode:         permutation 
## Splitrule:                        variance 
## OOB prediction error (MSE):       33.45173 
## R squared (OOB):                  0.7693244
# Test-set residuals together with their coordinates.
errordf <- with(
  inde_var_test,
  data.frame(error = predictions(predict(rf, inde_var_test)) - value_mean,
             LONGITUDE = LONGITUDE,
             LATITUDE = LATITUDE)
)

# Empirical variogram of the residuals (cutoff = 1 in coordinate units).
error_sp <- errordf %>%
  st_as_sf(coords = c("LONGITUDE", "LATITUDE")) %>%
  as_Spatial
dt.vgm <- variogram(error ~ 1, error_sp, cutoff = 1)
plot(dt.vgm)

LASSO

In sequence: mean, day, night. The prediction errors are much higher than for random forest, but a much simpler model is used. The variables selected for the three responses are slightly different from each other, and the variables selected also differ between runs.

# LASSO on daytime NO2 with visualisation enabled (vis1).
Lasso(inde_var, vis1 = TRUE, y_varname = "day_value", training = training,
      test = test)
## [1] 1.083474
## [1] 2.502967

##      RMSE       MAE       IQR 
## 12.750879  9.378781  9.949547
# LASSO on night-time NO2 with visualisation enabled (vis1).
Lasso(inde_var, vis1 = TRUE, y_varname = "night_value", training = training,
      test = test)
## [1] 0.2901371
## [1] 1.410821

##     RMSE      MAE      IQR 
## 6.572639 5.415657 8.888713